In [1]:
#Importing all required library

import nltk
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud, STOPWORDS
In [9]:
from tqdm import tqdm 
import re 
import nltk 
nltk.download('punkt') 
nltk.download('stopwords') 
nltk.download('wordnet')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer 
[nltk_data] Downloading package punkt to C:\Users\Amrendra
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Amrendra
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Amrendra
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Dataset1¶

1.importing 1st dataset 2.EDA on 1st dataset 3.Data pre-processing

In [3]:
Dataset1 = pd.read_csv("newss.csv")
In [8]:
import pandas as pd
import numpy as np
In [4]:
Dataset1.head()
Out[4]:
Unnamed: 0 title text label
0 8476 You Can Smell Hillary’s Fear Daniel Greenfield, a Shillman Journalism Fello... FAKE
1 10294 Watch The Exact Moment Paul Ryan Committed Pol... Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE
2 3608 Kerry to go to Paris in gesture of sympathy U.S. Secretary of State John F. Kerry said Mon... REAL
3 10142 Bernie supporters on Twitter erupt in anger ag... — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE
4 875 The Battle of New York: Why This Primary Matters It's primary day in New York and front-runners... REAL
In [5]:
Dataset1.tail()
Out[5]:
Unnamed: 0 title text label
6330 4490 State Department says it can't find emails fro... The State Department told the Republican Natio... REAL
6331 8062 The ‘P’ in PBS Should Stand for ‘Plutocratic’ ... The ‘P’ in PBS Should Stand for ‘Plutocratic’ ... FAKE
6332 8622 Anti-Trump Protesters Are Tools of the Oligarc... Anti-Trump Protesters Are Tools of the Oligar... FAKE
6333 4021 In Ethiopia, Obama seeks progress on peace, se... ADDIS ABABA, Ethiopia —President Obama convene... REAL
6334 4330 Jeb Bush Is Suddenly Attacking Trump. Here's W... Jeb Bush Is Suddenly Attacking Trump. Here's W... REAL
In [5]:
Dataset1.nunique()
Out[5]:
Unnamed: 0    6335
title         6256
text          6060
label            2
dtype: int64
In [6]:
Dataset1["Article"] = Dataset1["title"] + Dataset1["text"]
Dataset1.sample(frac = 1) #Shuffle 100%

Dataset1.label[Dataset1.label == 'REAL'] = 1
Dataset1.label[Dataset1.label == 'FAKE'] = 0

Dataset1 = Dataset1.loc[:,['Article','label']]
Dataset1 = Dataset1.dropna()
<ipython-input-6-586347da7020>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Dataset1.label[Dataset1.label == 'REAL'] = 1
<ipython-input-6-586347da7020>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Dataset1.label[Dataset1.label == 'FAKE'] = 0

In this step we will clean the data that will be used for training. The cleaning will involve these steps- 1.Removing all the extra information like brackets, any kind of puctuations - commas, apostrophes, quotes, question marks, and more. 2.Remove all the numeric text, urls

In [7]:
def wordpre(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text
In [8]:
##  Applying the wordpre method to the dataset
Dataset1['Article']=Dataset1['Article'].apply(wordpre)
In [9]:
#word used in Real news 
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset1[Dataset1.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[9]:
<matplotlib.image.AxesImage at 0x28458edb460>
In [10]:
#word used in Fake news 
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset1[Dataset1.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[10]:
<matplotlib.image.AxesImage at 0x28474209910>

Dataset2¶

In [11]:
Dataset2_true = pd.read_csv("C:/Users/Amrendra Mishra/Desktop/Fake_news_Detection/Dataset/True.csv")
Dataset2_fake = pd.read_csv("C:/Users/Amrendra Mishra/Desktop/Fake_news_Detection/Dataset/Fake.csv")
In [13]:
Dataset2_true.nunique()
Out[13]:
title      20826
text       21192
subject        2
date         716
dtype: int64
In [14]:
Dataset2_fake.nunique()
Out[14]:
title      17903
text       17455
subject        6
date        1681
dtype: int64
In [15]:
#Counting by Subjects in Real news
for key,count in Dataset2_true.subject.value_counts().iteritems():
    print(f"{key}:\t{count}")
    
#Getting Total Rows
print(f"Total Records:\t{Dataset2_true.shape[0]}")
politicsNews:	11272
worldnews:	10145
Total Records:	21417
In [16]:
#Counting by Subjects in Fake news
for key,count in Dataset2_fake.subject.value_counts().iteritems():
    print(f"{key}:\t{count}")
    
#Getting Total Rows
print(f"Total Records:\t{Dataset2_fake.shape[0]}")
News:	9050
politics:	6841
left-news:	4459
Government News:	1570
US_News:	783
Middle-east:	778
Total Records:	23481
In [17]:
#ploting the Subjects in Real news
plt.figure(figsize=(8,5))
sns.countplot("subject", data=Dataset2_true)
plt.show()
In [18]:
#ploting the Subjects in Fake news
plt.figure(figsize=(8,5))
sns.countplot("subject", data=Dataset2_fake)
plt.show()
In [19]:
Dataset2_true['label']= 1
Dataset2_fake['label']= 0
Dataset2 = pd.concat([Dataset2_true, Dataset2_fake])
Dataset2["Article"] = Dataset2["title"] + Dataset2["text"]
Dataset2.sample(frac = 1) #Shuffle 100%
Dataset2 = Dataset2.loc[:,['Article','label']]
In [25]:
Dataset2
Out[25]:
Article label
0 As U.S. budget fight looms, Republicans flip t... 1
1 U.S. military to accept transgender recruits o... 1
2 Senior U.S. Republican senator: 'Let Mr. Muell... 1
3 FBI Russia probe helped by Australian diplomat... 1
4 Trump wants Postal Service to charge 'much mor... 1
... ... ...
23476 McPain: John McCain Furious That Iran Treated ... 0
23477 JUSTICE? Yahoo Settles E-mail Privacy Class-ac... 0
23478 Sunnistan: US and Allied ‘Safe Zone’ Plan to T... 0
23479 How to Blow $700 Million: Al Jazeera America F... 0
23480 10 U.S. Navy Sailors Held by Iranian Military ... 0

44898 rows × 2 columns

In [20]:
##  Applying the wordpre method to the dataset
Dataset2['Article']=Dataset2['Article'].apply(wordpre)
In [56]:
Dataset2.head()
Out[56]:
Article label
0 as u s budget fight looms republicans flip t... 1
1 u s military to accept transgender recruits o... 1
2 senior u s republican senator let mr muell... 1
3 fbi russia probe helped by australian diplomat... 1
4 trump wants postal service to charge much mor... 1
In [21]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset2[Dataset2.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[21]:
<matplotlib.image.AxesImage at 0x28477068040>
In [22]:
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset2[Dataset2.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[22]:
<matplotlib.image.AxesImage at 0x28459128640>

Dataset 3¶

In [23]:
Dataset3_real = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/politifact/politifact_real.csv")
Dataset3_fake = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/politifact/politifact_fake.csv")
In [24]:
Dataset3_real['label']= 1
Dataset3_fake['label']= 0
Dataset3 = pd.concat([Dataset3_real, Dataset3_fake])
Dataset3["Article"] = Dataset3["title"]
Dataset3.sample(frac = 1) #Shuffle 100%
Dataset3 = Dataset3.loc[:,['Article','label']]
In [25]:
Dataset3
Out[25]:
Article label
0 National Federation of Independent Business 1
1 comments in Fayetteville NC 1
2 Romney makes pitch, hoping to close deal : Ele... 1
3 Democratic Leaders Say House Democrats Are Uni... 1
4 Budget of the United States Government, FY 2008 1
... ... ...
427 Who is affected by the government shutdown? 0
428 Lindsey Graham Threatens To Convert To Democra... 0
429 ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A... 0
430 Sarah Palin Calls To Boycott Mall Of America B... 0
431 Account Suspended 0

1056 rows × 2 columns

In [26]:
##  Applying the wordpre method to the dataset
Dataset3['Article']=Dataset3['Article'].apply(wordpre)
In [27]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset3[Dataset3.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[27]:
<matplotlib.image.AxesImage at 0x284590c40a0>
In [28]:
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset3[Dataset3.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[28]:
<matplotlib.image.AxesImage at 0x28458da76a0>

Dataset 4¶

In [29]:
Dataset4 = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/Data-set/train.csv")
In [30]:
Dataset4.head()
Out[30]:
id title author text label
0 0 House Dem Aide: We Didn’t Even See Comey’s Let... Darrell Lucus House Dem Aide: We Didn’t Even See Comey’s Let... 1
1 1 FLYNN: Hillary Clinton, Big Woman on Campus - ... Daniel J. Flynn Ever get the feeling your life circles the rou... 0
2 2 Why the Truth Might Get You Fired Consortiumnews.com Why the Truth Might Get You Fired October 29, ... 1
3 3 15 Civilians Killed In Single US Airstrike Hav... Jessica Purkiss Videos 15 Civilians Killed In Single US Airstr... 1
4 4 Iranian woman jailed for fictional unpublished... Howard Portnoy Print \nAn Iranian woman has been sentenced to... 1
In [31]:
Dataset4["Article"] = Dataset4["title"] + Dataset4["text"]
Dataset4.sample(frac = 1) #Shuffle 100%

Dataset4 = Dataset4.loc[:,['Article','label']]
Dataset4 = Dataset4.dropna()
In [32]:
##  Applying the wordpre method to the dataset
Dataset4['Article']=Dataset4['Article'].apply(wordpre)
In [33]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset4[Dataset4.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[33]:
<matplotlib.image.AxesImage at 0x284744a6f10>
In [34]:
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset4[Dataset4.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[34]:
<matplotlib.image.AxesImage at 0x28466504490>

Dataset 5¶

In [35]:
Dataset5 = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/data.csv")
In [36]:
Dataset5
Out[36]:
URLs Headline Body Label
0 http://www.bbc.com/news/world-us-canada-414191... Four ways Bob Corker skewered Donald Trump Image copyright Getty Images\nOn Sunday mornin... 1
1 https://www.reuters.com/article/us-filmfestiva... Linklater's war veteran comedy speaks to moder... LONDON (Reuters) - “Last Flag Flying”, a comed... 1
2 https://www.nytimes.com/2017/10/09/us/politics... Trump’s Fight With Corker Jeopardizes His Legi... The feud broke into public view last week when... 1
3 https://www.reuters.com/article/us-mexico-oil-... Egypt's Cheiron wins tie-up with Pemex for Mex... MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin... 1
4 http://www.cnn.com/videos/cnnmoney/2017/10/08/... Jason Aldean opens 'SNL' with Vegas tribute Country singer Jason Aldean, who was performin... 1
... ... ... ... ...
4004 http://beforeitsnews.com/sports/2017/09/trends... Trends to Watch Trends to Watch\n% of readers think this story... 0
4005 http://beforeitsnews.com/u-s-politics/2017/10/... Trump Jr. Is Soon To Give A 30-Minute Speech F... Trump Jr. Is Soon To Give A 30-Minute Speech F... 0
4006 https://www.activistpost.com/2017/09/ron-paul-... Ron Paul on Trump, Anarchism & the AltRight NaN 0
4007 https://www.reuters.com/article/us-china-pharm... China to accept overseas trial data in bid to ... SHANGHAI (Reuters) - China said it plans to ac... 1
4008 http://beforeitsnews.com/u-s-politics/2017/10/... Vice President Mike Pence Leaves NFL Game Beca... Vice President Mike Pence Leaves NFL Game Beca... 0

4009 rows × 4 columns

In [37]:
Dataset5["Article"] = Dataset5["Headline"] + Dataset5["Body"]
Dataset5["label"] = Dataset5["Label"]
Dataset5.sample(frac = 1) #Shuffle 100%
Dataset5 = Dataset5.loc[:,['Article','label']]
Dataset5 = Dataset5.dropna()
In [38]:
##  Applying the wordpre method to the dataset
Dataset5['Article']=Dataset5['Article'].apply(wordpre)
In [39]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset5[Dataset5.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[39]:
<matplotlib.image.AxesImage at 0x2846a0e8d90>
In [40]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset5[Dataset5.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[40]:
<matplotlib.image.AxesImage at 0x2846a38c310>
In [41]:
#combining all the datset into one
frames = [Dataset1, Dataset2, Dataset3, Dataset4,Dataset5]
Dataset = pd.concat(frames)
In [55]:
Dataset.shape
Out[55]:
(76480, 2)
In [60]:
x_train,x_test,y_train,y_test = train_test_split(Dataset['Article'], Dataset['label'], test_size=0.2, random_state=2020)
In [61]:
x_train.shape
Out[61]:
(61184,)
In [62]:
x_test.shape
Out[62]:
(15296,)
In [64]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')
In [65]:
#LogisticRegression
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

Logisticmodel = pipe.fit(x_train, y_train)
prediction = Logisticmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Logisticmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 87.04%
In [66]:
#####DecisionTreeClassifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = 10, 
                                           splitter='best', 
                                           random_state=2020))])
DecisionTreemodel = pipe.fit(x_train, y_train)
prediction = DecisionTreemodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
DecisionTreemodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 82.07%
In [67]:
#####RandomForestClassifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', RandomForestClassifier())])

RandomForestmodel = pipe.fit(x_train, y_train) 
prediction = RandomForestmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
RandomForestmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 82.49%
In [68]:
#Stochastic Gradient Descent
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', SGDClassifier())])
SGDmodel = pipe.fit(x_train, y_train)
prediction = SGDmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
SDGmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 86.23%
In [69]:
#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', GradientBoostingClassifier(loss = 'deviance',
                                                   learning_rate = 0.01,
                                                   n_estimators = 10,
                                                   max_depth = 5,
                                                   random_state=55))])

GBCmodel = pipe.fit(x_train, y_train)
prediction = GBCmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
GBCmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 80.71%
In [70]:
#########XGBClassifier
from xgboost import XGBClassifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', XGBClassifier(loss = 'deviance',
                                                   learning_rate = 0.01,
                                                   n_estimators = 10,
                                                   max_depth = 5,
                                                   random_state=2020))])

xgboostmodel = pipe.fit(x_train, y_train)
prediction = xgboostmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
xgboostmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
[13:51:37] WARNING: C:\Users\Administrator\workspace\xgboost-win64_release_1.2.0\src\learner.cc:516: 
Parameters: { loss } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


accuracy: 80.75%
In [71]:
#######Multinomial Naive Bayes Classifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', MultinomialNB())])

MNBCmodel = pipe.fit(x_train, y_train)
prediction = MNBCmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Multinomial_Naive_Bayes_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 78.79%
In [72]:
#############Bernoulli Naive Bayes Classifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', BernoulliNB())])

BNBCmodel = pipe.fit(x_train, y_train)
prediction = BNBCmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Bernoulli_Naive_Bayes_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 76.08%
In [73]:
x = ["SDGmodel_accuracy", "Logisticmodel_accuracy", "GBCmodel_accuracy", "xgboostmodel_accuracy" , 
     "DecisionTreemodel_accuracy","RandomForestmodel_accuracy","Multinomial_Naive_Bayes_accuracy",
     "Bernoulli_Naive_Bayes_accuracy"]
y =  [SDGmodel_accuracy,Logisticmodel_accuracy,GBCmodel_accuracy,xgboostmodel_accuracy,
      DecisionTreemodel_accuracy,RandomForestmodel_accuracy,Multinomial_Naive_Bayes_accuracy,
      Bernoulli_Naive_Bayes_accuracy]
plt.barh(x, y)

for index, value in enumerate(y):
    plt.text(value, index, str(value))
In [77]:
import joblib  
# Save the model as a pickle in a file 
joblib.dump(Logisticmodel, 'model.pkl')  
Out[77]:
['model.pkl']